library(magrittr)
library(plyr)
library(tidyverse)
library(readxl)
library(vietnamcode)


rm(list=ls())
home = 'C:/Users/Jason/Dropbox/VNA_Responsiveness/Analysis/JOP-dataverse/'


papi = paste0(home, 'papi-education-formatted-2017.xlsx') %>%
  read_xlsx
pci  = paste0(home, 'pci-education-formatted-2017.xlsx') %>%
  read_xlsx
provinces = papi$province
papi = papi %>%
  subset(select=-c(province,
                   header)) %>%
  prcomp(scale.=T)
pci = pci %>%
  subset(select=-c(province,
                   header)) %>%
  prcomp(scale.=T)
combined = data.frame(PAPI=papi$x[,'PC1'], 
                      PCI=pci$x[,'PC1'], 
                      Province=provinces) %>%
  mutate(PAPI=PAPI*(-1), 
         PCI=PCI*(-1))


survey = paste0(home, 'survey-outcomes.xlsx') %>%
  read_xlsx %>%
  subset(!is.na(Treatment))
pooled_texts = paste0(home, 'pooled-outcomes.xlsx') %>%
  read_xlsx %>%
  mutate(Treatment=factor(x=Treatment, 
                          levels=c('Control',
                                   'Citizen',
                                   'Firm'))) %>%
  subset(!is.na(Treatment)) %>%
  ddply(.variables='Province', 
        .fun=function(x) {
          df = colMeans(x=x[,c('FullTime','CentNom','Competitive')]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(c('FullTime','CentNom','Competitive')) %>%
            mutate(Citizen=sum(x$Treatment=='Citizen')/nrow(x),
                   Firm=sum(x$Treatment=='Firm')/nrow(x))
          df = apply(X=x[,match(x='said_own_province', 
                                table=colnames(x)):match(x='Spoke', 
                                                         table=colnames(x))], 
                     MARGIN=2, 
                     FUN=function(y) as.integer(any(y>0)) ) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('any_', colnames(x)[match(x='said_own_province', 
                                                          table=colnames(x)):match(x='Spoke', 
                                                                                   table=colnames(x))])) %>%
            cbind(df, .) %>%
            mutate(LogNchar=sum(x$LogNchar))
          df = colSums(x=x[,match(x='said_own_province', 
                                  table=colnames(x)):match(x='Spoke', 
                                                           table=colnames(x))]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('count_', colnames(x)[match(x='said_own_province', 
                                                            table=colnames(x)):match(x='Spoke', 
                                                                                     table=colnames(x))])) %>%
            cbind(df, .)
          colMeans(x=x[,match(x='said_own_province', 
                              table=colnames(x)):match(x='Spoke', 
                                                       table=colnames(x))]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('prop_', colnames(x)[match(x='said_own_province', 
                                                           table=colnames(x)):match(x='Spoke', 
                                                                                    table=colnames(x))])) %>%
            cbind(df, .)
        })
caucus_texts = paste0(home, 'caucus-outcomes.xlsx') %>%
  read_xlsx %>%
  mutate(Treatment=factor(x=Treatment, 
                          levels=c('Control',
                                   'Citizen',
                                   'Firm'))) %>%
  subset(!is.na(Treatment)) %>%
  ddply(.variables='Province', 
        .fun=function(x) {
          df = colMeans(x=x[,c('FullTime','CentNom','Competitive')]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(c('FullTime','CentNom','Competitive')) %>%
            mutate(Citizen=sum(x$Treatment=='Citizen')/nrow(x),
                   Firm=sum(x$Treatment=='Firm')/nrow(x))
          df = apply(X=x[,match(x='said_own_province', 
                                table=colnames(x)):match(x='Spoke', 
                                                         table=colnames(x))], 
                     MARGIN=2, 
                     FUN=function(y) as.integer(any(y>0)) ) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('any_', colnames(x)[match(x='said_own_province', 
                                                          table=colnames(x)):match(x='Spoke', 
                                                                                   table=colnames(x))])) %>%
            cbind(df, .) %>%
            mutate(LogNchar=sum(x$LogNchar))
          df = colSums(x=x[,match(x='said_own_province', 
                                  table=colnames(x)):match(x='Spoke', 
                                                           table=colnames(x))]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('count_', colnames(x)[match(x='said_own_province', 
                                                            table=colnames(x)):match(x='Spoke', 
                                                                                     table=colnames(x))])) %>%
            cbind(df, .)
          colMeans(x=x[,match(x='said_own_province', 
                              table=colnames(x)):match(x='Spoke', 
                                                       table=colnames(x))]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('prop_', colnames(x)[match(x='said_own_province', 
                                                           table=colnames(x)):match(x='Spoke', 
                                                                                    table=colnames(x))])) %>%
            cbind(df, .)
        })
query_texts = paste0(home, 'query-outcomes.xlsx') %>%
  read_xlsx %>%
  mutate(Treatment=factor(x=Treatment, 
                          levels=c('Control',
                                   'Citizen',
                                   'Firm'))) %>%
  subset(!is.na(Treatment)) %>%
  ddply(.variables='Province', 
        .fun=function(x) {
          df = colMeans(x=x[,c('FullTime','CentNom','Competitive')]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(c('FullTime','CentNom','Competitive')) %>%
            mutate(Citizen=sum(x$Treatment=='Citizen')/nrow(x),
                   Firm=sum(x$Treatment=='Firm')/nrow(x))
          df = apply(X=x[,match(x='said_own_province', 
                                table=colnames(x)):match(x='Spoke', 
                                                         table=colnames(x))], 
                     MARGIN=2, 
                     FUN=function(y) as.integer(any(y>0)) ) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('any_', colnames(x)[match(x='said_own_province', 
                                                          table=colnames(x)):match(x='Spoke', 
                                                                                   table=colnames(x))])) %>%
            cbind(df, .) %>%
            mutate(LogNchar=sum(x$LogNchar))
          df = colSums(x=x[,match(x='said_own_province', 
                                  table=colnames(x)):match(x='Spoke', 
                                                           table=colnames(x))]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('count_', colnames(x)[match(x='said_own_province', 
                                                            table=colnames(x)):match(x='Spoke', 
                                                                                     table=colnames(x))])) %>%
            cbind(df, .)
          colMeans(x=x[,match(x='said_own_province', 
                              table=colnames(x)):match(x='Spoke', 
                                                       table=colnames(x))]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('prop_', colnames(x)[match(x='said_own_province', 
                                                           table=colnames(x)):match(x='Spoke', 
                                                                                    table=colnames(x))])) %>%
            cbind(df, .)
        })
floor_texts = paste0(home, 'floor-outcomes.xlsx') %>%
  read_xlsx %>%
  mutate(Treatment=factor(x=Treatment, 
                          levels=c('Control',
                                   'Citizen',
                                   'Firm'))) %>%
  subset(!is.na(Treatment)) %>%
  ddply(.variables='Province', 
        .fun=function(x) {
          df = colMeans(x=x[,c('FullTime','CentNom','Competitive')]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(c('FullTime','CentNom','Competitive')) %>%
            mutate(Citizen=sum(x$Treatment=='Citizen')/nrow(x),
                   Firm=sum(x$Treatment=='Firm')/nrow(x))
          df = apply(X=x[,match(x='said_citizen_related', 
                                table=colnames(x)):match(x='Spoke', 
                                                         table=colnames(x))], 
                     MARGIN=2, 
                     FUN=function(y) as.integer(any(y>0)) ) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('any_', colnames(x)[match(x='said_citizen_related', 
                                                          table=colnames(x)):match(x='Spoke', 
                                                                                   table=colnames(x))])) %>%
            cbind(df, .) %>%
            mutate(LogNchar=sum(x$LogNchar))
          df = colSums(x=x[,match(x='said_citizen_related', 
                                  table=colnames(x)):match(x='Spoke', 
                                                           table=colnames(x))]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('count_', colnames(x)[match(x='said_citizen_related', 
                                                            table=colnames(x)):match(x='Spoke', 
                                                                                     table=colnames(x))])) %>%
            cbind(df, .)
          colMeans(x=x[,match(x='said_citizen_related', 
                              table=colnames(x)):match(x='Spoke', 
                                                       table=colnames(x))]) %>%
            matrix(nrow=1) %>%
            as.data.frame %>%
            set_colnames(paste0('prop_', colnames(x)[match(x='said_citizen_related', 
                                                           table=colnames(x)):match(x='Spoke', 
                                                                                    table=colnames(x))])) %>%
            cbind(df, .)
        })


data("vietnamcode_data")
combined = combined %>%
  mutate(Province=mapvalues(x=as.character(Province), 
                            from=vietnamcode_data$province_name_diacritics, 
                            to=vietnamcode_data$province_name,
                            warn_missing=F))
combined$Province[str_detect(string=combined$Province, 
                             pattern='B\u00E0 R\u1ECBa\u2013V\u0169ng T\u00E0u')] = 'Ba Ria Vung Tau'
combined$Province[str_detect(string=combined$Province, 
                             pattern='Th\u1EEBa Thi\u00EAn\u2013Hu\u1EBF')] = 'Thua Thien - Hue'
combined$Province = mapvalues(x=combined$Province, 
                              from=sort(setdiff(combined$Province, unique(survey$Province))), 
                              to=sort(setdiff(unique(survey$Province), combined$Province)))
survey = merge(x=survey, 
               y=combined, 
               by='Province', 
               all.x=T)
pooled_texts = merge(x=pooled_texts, 
                     y=combined, 
                     by='Province', 
                     all.x=T)
floor_texts = merge(x=floor_texts, 
                    y=combined, 
                    by='Province', 
                    all.x=T)
query_texts = merge(x=query_texts, 
                    y=combined, 
                    by='Province', 
                    all.x=T)
caucus_texts = merge(x=caucus_texts, 
                     y=combined, 
                     by='Province', 
                     all.x=T)
not_query_texts = apply(X=floor_texts[,paste0('any_said_', c('citizens','firms','objectives','vocational','teachers','fees'))] + 
                          caucus_texts[,paste0('any_said_', c('citizens','firms','objectives','vocational','teachers','fees'))], 
                        MARGIN=2, 
                        FUN=pmin, 1) %>%
  as.data.frame %>%
  cbind(subset(pooled_texts, select=c(Province, 
                                      FullTime,
                                      CentNom,
                                      Competitive,
                                      Citizen,
                                      Firm,
                                      PAPI,
                                      PCI)), .)



fit_1 = lm(any_said_own_province ~ PAPI + PCI + Citizen + Firm + FullTime + CentNom + Competitive, 
           data=pooled_texts)
fit_2 = lm(any_said_own_province ~ PAPI + PCI + Citizen + Firm + FullTime + CentNom + Competitive, 
           data=query_texts)
fit_3 = lm(any_said_own_province ~ PAPI + PCI + Citizen + Firm + FullTime + CentNom + Competitive, 
           data=caucus_texts)
fit_4 = lm(any_said_citizens ~ PAPI + PCI + Citizen + Firm + FullTime + CentNom + Competitive, 
           data=not_query_texts)
fit_5 = lm(any_said_firms ~ PAPI + PCI + Citizen + Firm + FullTime + CentNom + Competitive, 
           data=not_query_texts)
fit_6 = lm(any_said_objectives ~ PAPI + PCI + Citizen + Firm + FullTime + CentNom + Competitive, 
           data=not_query_texts)
fit_7 = lm(any_said_vocational ~ PAPI + PCI + Citizen + Firm + FullTime + CentNom + Competitive, 
           data=not_query_texts)
fit_8 = lm(any_said_teachers ~ PAPI + PCI + Citizen + Firm + FullTime + CentNom + Competitive, 
           data=not_query_texts)
fit_9 = lm(any_said_fees ~ PAPI + PCI + Citizen + Firm + FullTime + CentNom + Competitive, 
           data=not_query_texts)

stargazer(fit_1, 
          fit_2, 
          fit_3, 
          fit_4, 
          fit_5, 
          fit_6, 
          fit_7, 
          fit_8, 
          fit_9, 
          title='Analyzing speech content with PCA. PAPI and PCI scores have no effect on delegation-level speaking behavior.', 
          align=T, 
          header=F,
          no.space=T, 
          star.cutoffs=NA,
          column.labels=c('Mentioned own province','Citizens','Firms','Art. 27-29','Art. 31','Art. 70','Art. 105'),
          column.separate=c(3, 1, 1, 1, 1, 1, 1),
          dep.var.labels.include=F,
          label='appendix-table-06.2',
          notes.append=F, 
          notes='Transcripts from floor debates and group caucuses only. The unit of analysis is the province.') %>%
  writeClipboard
